# load necceary packages
packages <- c("topicmodels", "MASS", "RTextTools", "stringr", "ggplot2", "tm", "qdap", "tidytext",
"dplyr", "tidyr", "ggthemes", "knitr")
pakcages <- lapply(packages, FUN = function(x) {
if(!require(x, character.only = TRUE)) {
install.packages(x)
library(x, character.only = TRUE)
}
})
## Loading required package: topicmodels
## Loading required package: MASS
## Loading required package: RTextTools
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
## Loading required package: stringr
## Loading required package: ggplot2
## Loading required package: tm
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
## Loading required package: qdap
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
##
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
##
## %+%
## Loading required package: qdapTools
## Loading required package: RColorBrewer
##
## Attaching package: 'qdap'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
##
## ngrams
## The following object is masked from 'package:stringr':
##
## %>%
## The following object is masked from 'package:base':
##
## Filter
## Loading required package: tidytext
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:qdap':
##
## %>%
## The following object is masked from 'package:qdapTools':
##
## id
## The following objects are masked from 'package:qdapRegex':
##
## escape, explain
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: tidyr
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:qdap':
##
## %>%
## Loading required package: ggthemes
## Loading required package: knitr
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:qdap':
##
## %>%
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
setwd("~/Google Drive/017Spring/Data Visualization/project")
data <- read.csv("final_data.csv")
review <- read.csv('data_r.csv', fileEncoding = "latin1")
# exclude non-ASCII texts
review$text <- gsub("[^\x20-\x7E]", "", review$text)
# merge attribute data frame and review text file
total <- merge(data, review, by = "business_id", all = FALSE)
# only keep unique texts (delete duplicated ones)
total_u <- distinct(total, text, .keep_all = TRUE)
# choose only downtown and The Strip
total_s <- total_u %>% filter(neighborhood == "Downtown" | neighborhood == "The Strip")
##### clean text #####
total_s$text <- as.character(total_s$text)
str(total_s$text)
## chr [1:31400] "One of the few trucks that is actually pretty reasonably priced, as I think the prices on food trucks are pretty outrageous - o"| __truncated__ ...
text <- as.data.frame(total_s$text)
df_source <- DataframeSource(text[1])
df_corpus <- VCorpus(df_source)
df_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 31400
# check the contents
df_corpus[[1]][1]
## $content
## [1] "One of the few trucks that is actually pretty reasonably priced, as I think the prices on food trucks are pretty outrageous - one of the reasons why I am not a fan of the food truck phenomenon here in Las Vegas.Sampled the San Leche Cake, which was pretty good! The wings, although breaded, are decent, but I would still prefer them to be naked.As Holly has said, they use very fresh ingredients. Worth giving a try if you're in the mood to chase a food truck."
df_corpus[[13312]][1]
## $content
## [1] "This review is based solely on the cheese danish, which we ordered based on previous reviews. For us, it wasn't much better than something you would get at Panera, and nowhere near the cheese danish from the Bouchon at The Venetian."
# Text cleaning
df_corpus <- tm_map(df_corpus, content_transformer(tolower))
df_corpus <- tm_map(df_corpus, content_transformer(removeWords), c(stopwords("english")))
# df_corpus <- tm_map(df_corpus, content_transformer(removeWords), c("list", "w/"))
df_corpus <- tm_map(df_corpus, content_transformer(removeNumbers))
df_corpus <- tm_map(df_corpus, content_transformer(removePunctuation))
# df_corpus <- tm_map(df_corpus, content_transformer(replace_abbreviation))
# df_corpus <- tm_map(df_corpus, replace_symbol)
# delete non-english characters
## stem document
df_corpus_stem <- tm_map(df_corpus, stemDocument)
df_corpus_stem <- tm_map(df_corpus_stem, stripWhitespace)
# check stemmed document
df_corpus_stem
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 31400
df_corpus_stem[[1]][1]
## $content
## [1] "one truck actual pretti reason price think price food truck pretti outrag one reason fan food truck phenomenon las vegassampl san lech cake pretti good wing although bread decent still prefer nake holli said use fresh ingredi worth give tri mood chase food truck"
df_corpus_stem[[2]][1]
## $content
## [1] "establish definit star great servic uniqu experi chocol match order sunda everi time visit vega sf locat lot smaller even though compani origin las vega locat great staff stage near constant live music order peanut butter fudg sunda past two year warm browni sunda hot fudg sunda realli good sunda get"
# givr row names to data frame to match neighbhorhood in tdm and dtm
total_s$row.names <- 1:nrow(total_s)
# dtm and tdm
tdm <-TermDocumentMatrix(df_corpus_stem)
tdm_td <- tidy(tdm)
tdm_td$neighborhood <- total_s[match(tdm_td[['document']], total_s[['row.names']]), 'neighborhood']
head(tdm_td)
## # A tibble: 6 × 4
## term document count neighborhood
## <chr> <chr> <dbl> <fctr>
## 1 actual 1 1 The Strip
## 2 although 1 1 The Strip
## 3 bread 1 1 The Strip
## 4 cake 1 1 The Strip
## 5 chase 1 1 The Strip
## 6 decent 1 1 The Strip
dtm <- DocumentTermMatrix(df_corpus_stem)
dtm_td <- tidy(dtm)
head(dtm_td)
## # A tibble: 6 × 3
## document term count
## <chr> <chr> <dbl>
## 1 1 actual 1
## 2 1 although 1
## 3 1 bread 1
## 4 1 cake 1
## 5 1 chase 1
## 6 1 decent 1
dtm_td$neighborhood <- total_s[match(dtm_td[['document']], total_s[['row.names']]), 'neighborhood']
head(dtm_td)
## # A tibble: 6 × 4
## document term count neighborhood
## <chr> <chr> <dbl> <fctr>
## 1 1 actual 1 The Strip
## 2 1 although 1 The Strip
## 3 1 bread 1 The Strip
## 4 1 cake 1 The Strip
## 5 1 chase 1 The Strip
## 6 1 decent 1 The Strip
# The bar graph of frquent words used in review of each neighbor
# frequent terms in general
tdm_td %>% group_by(term) %>%
summarise(n = sum(count)) %>%
arrange(desc(n)) %>%
top_n(n = 20, wt = n) %>%
mutate(term = reorder(term, n)) %>%
ggplot(aes(term, n)) +
geom_bar(stat = "identity", fill = "skyblue") +
xlab(NULL) + coord_flip() + theme_tufte() +
ggtitle("The Words Most Frequently Used in Both Neighborhood") +
theme(plot.title = element_text(lineheight=.8, face="bold")) +
ylab("Term Frequency")

# Bind the TF,DF, and IDF frequency
# of a tidy text dataset to the dataset
tf_idf <- tdm_td %>%
bind_tf_idf(term, document, count) %>%
arrange(desc(tf_idf))
tf_idf
## # A tibble: 1,598,232 × 7
## term document count neighborhood tf idf tf_idf
## <chr> <chr> <dbl> <fctr> <dbl> <dbl> <dbl>
## 1 foodnoth 12349 1 The Strip 0.5000000 8.968014 4.484007
## 2 arknoah 9231 1 The Strip 0.3333333 10.354308 3.451436
## 3 ark 9231 1 The Strip 0.3333333 9.661161 3.220387
## 4 noah 9231 1 The Strip 0.3333333 8.408398 2.802799
## 5 mcchicken 14725 1 The Strip 0.2500000 10.354308 2.588577
## 6 preti 24378 1 Downtown 0.2500000 10.354308 2.588577
## 7 phoke 27402 1 The Strip 0.2500000 10.354308 2.588577
## 8 margaritasyum 27737 1 The Strip 0.2500000 10.354308 2.588577
## 9 nom 11338 6 The Strip 0.4000000 6.403065 2.561226
## 10 pizzagreat 24378 1 Downtown 0.2500000 9.661161 2.415290
## # ... with 1,598,222 more rows
tf_idf2 <- tdm_td %>%
bind_tf_idf(term, document, count) %>%
arrange(desc(tf))
# frequent words by used in reviews of each neighborhood
### The Strip
#### by tf
tf_idf2 %>% filter(neighborhood=="The Strip") %>%
arrange(desc(tf)) %>%
top_n(n = 10, wt = tf) %>%
ggplot(aes(x = term, y = tf)) +
geom_bar(stat = "identity", fill = "skyblue") +
xlab(NULL) + ylab("Term Frequency") + coord_flip() + theme_tufte() +
ggtitle("Top 10 Frequent Words Used in Reviews of The Strip") +
theme(plot.title = element_text(lineheight=.8, face="bold"))

#### by tf_idf
tf_idf %>% filter(neighborhood=="The Strip") %>%
top_n(n = 10, wt = tf_idf) %>%
ggplot(aes(x = reorder(term, tf_idf), y = tf_idf)) +
geom_bar(stat = "identity", fill = "skyblue") +
xlab(NULL) + ylab("TF-IDF") + coord_flip() + theme_tufte() +
ggtitle("The Frequent Words Used in The Strip") +
theme(plot.title = element_text(lineheight=.8, face="bold"))

### Downtown
#### by tf
tf_idf2 %>% filter(neighborhood=="Downtown") %>%
#arrange(desc(tf)) %>%
top_n(n = 10, wt = tf) %>%
ggplot(aes(x = term, y = tf)) +
geom_bar(stat = "identity", fill = "skyblue") +
xlab(NULL) + ylab("Term Frequency") + coord_flip() + theme_tufte() +
ggtitle("Top 10 Frequent Words Used in Reviews of Downtown") +
theme(plot.title = element_text(lineheight=.8, face="bold"))

#### by tf_idf
tf_idf %>% filter(neighborhood=="Downtown") %>%
arrange(desc(tf_idf)) %>%
top_n(n = 10, wt = tf_idf) %>%
ggplot(aes(x = reorder(term, tf_idf), y = tf_idf)) +
geom_bar(stat = "identity", fill = "skyblue") +
xlab(NULL) + ylab("TF-IDF") + coord_flip() + theme_tufte() +
ggtitle("The Frequent Words Used in Downtown") +
theme(plot.title = element_text(lineheight=.8, face="bold"))

# comparison
plot <- tf_idf %>%
group_by(neighborhood) %>%
top_n(n = 10, wt = tf_idf) %>%
mutate(key = 1:10)
plot %>% ggplot() +
geom_bar(aes(x = term))

####################### wordcloud #######################
# wordcloud by neighborhood
library(wordcloud)
# Create purple_orange
purple_orange <- brewer.pal(10, "PuOr")
# Drop 2 faintest colors
purple_orange <- purple_orange[-(1:2)]
## The Strip
term_frequency_s <- tf_idf %>% filter(neighborhood=="The Strip")
set.seed(100)
# Create a wordcloud for the review in The Strip
wordcloud(term_frequency_s$term, term_frequency_s$tf,
max.words = 200,
colors= purple_orange)
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : sandwich could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : prop could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : breakfast could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : expens could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : servic could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : hookah could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : waffl could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : pretti could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : slow could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : good could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : best could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : good could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : price could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : marri could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : food could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : chili could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : good could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : phoke could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : neon could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : food could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : good could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : good could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : burger could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : servic could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : good could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : come could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : pizza could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : chicken could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : good could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : excel could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : bomb could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : egg could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : love could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : burger could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : cocktail could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : love could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : close could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : best could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : wrong could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : gross could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : wing could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : spici could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : fritter could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : want could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : smore could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : ever could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : cooki could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : perfect could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : pool could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : meat could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : sandwich could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : place could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : good could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : best could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : irish could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : hot could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : order could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : nom could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : ark could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : food could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : food could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : pleasant could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : eat could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : impress could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : lemon could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : ham could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : medium could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : food could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : rave could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : egg could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : good could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : buffet could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : star could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : price could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : amaz could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : love could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : casino could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : like could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : burger could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : love could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : noah could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : good could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : burger could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : tasti could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : star could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : food could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : love could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : chocol could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : good could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : prosciutto could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : food could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : mass could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : memor could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : best could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : pizza could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : alway could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : mani could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : good could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : good could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : love could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : amaz could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : sit could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : open could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : like could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : appeal could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : song could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : vega could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : love could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : buffet could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : lobster could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : spaghetti could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : pizza could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : pack could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : oyster could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : food could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : much could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : nice could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : lobster could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : mason could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : now could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : everyth could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : arknoah could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : foodnoth could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : margaritasyum could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : good could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : burger could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : star could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_s$term, term_frequency_s$tf, max.words
## = 200, : food could not be fit on page. It will not be plotted.

### includes words "overpriced"
## Downtown
term_frequency_DT <- tf_idf %>% filter(neighborhood=="Downtown")
set.seed(213)
# Create a wordcloud for the review in Downtown
wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
max.words = 100, colors = purple_orange)
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : doughnut could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : garbag could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : pizza could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : yummi could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : outlet could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : lotsa could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : bathroom could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : lunch could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : pizzagreat could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : time could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : place could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : place could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : dough could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : nice could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : share could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : pizza could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : coney could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : order could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : fukuburg could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : waitress could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : number could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : period could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : pizza could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : food could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : cheap could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT$term, term_frequency_DT$tf,
## max.words = 100, : see could not be fit on page. It will not be plotted.

#### includes the world "cheap"
########### Sentiment #############
setwd("~/Google Drive/017Spring/Data Visualization/hw3")
pos <- read.table("positive-words.txt", as.is=T)
neg <- read.table("negative-words.txt", as.is=T)
library(quanteda)
## quanteda version 0.9.9.24
## Using 7 of 8 cores for parallel computing
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:qdap':
##
## as.DocumentTermMatrix, as.wfm, ngrams, weight
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:NLP':
##
## ngrams
## The following object is masked from 'package:utils':
##
## View
## The following object is masked from 'package:base':
##
## sample
setwd("~/Google Drive/017Spring/Data Visualization/project")
sentiment <- function(words=c("really great good stuff bad")){
require(quanteda)
tok <- quanteda::tokenize(words)
pos.count <- sum(tok[[1]]%in%pos[,1])
neg.count <- sum(tok[[1]]%in%neg[,1])
out <- (pos.count - neg.count)/(pos.count+neg.count)
return(out)
}
Strip <- total_s %>% filter(neighborhood == "The Strip")
Downtown <- total_s %>% filter(neighborhood == "Downtown")
Sent_s <- data.frame(matrix(0, ncol = 1, nrow = nrow(Strip)))
colnames(Sent_s)[1] <- "sent"
Sent_s$text <- Strip$text
Sent_s$star <- Strip$stars
Sent_d <- data.frame(matrix(0, ncol = 1, nrow = nrow(Downtown)))
Sent_d$text <- Downtown$text
colnames(Sent_d)[1] <- "sent"
Sent_d$star <- Downtown$stars
for(i in 1:nrow(Strip)) {
Sent_s[[i, 1]] <- sentiment(Strip[[i, 495]])
}
for(i in 1:nrow(Downtown)) {
Sent_d[[i,1]] <- sentiment(Downtown[[i, 495]])
}
sentiment(Strip[[7, 495]])
## [1] 1
sentiment(Downtown$text)
## [1] 1
Sent_d$neigh <- "Downtown"
Sent_d$neigh <- factor(Sent_d$neigh)
Sent_s$neigh <- "The Strip"
Sent_s$neigh <- factor(Sent_s$neigh)
Neighborhood <- rbind(Sent_d, Sent_s)
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:plotly':
##
## arrange, mutate, rename, summarise
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:qdapTools':
##
## id
Neighborhood$rate <- "low"
Neighborhood$rate <- ifelse(Neighborhood$star < 2.5, "low",
ifelse(Neighborhood$star < 4, "mid", "high"))
table(Neighborhood$rate)
##
## high low mid
## 16627 284 14489
Neighborhood$rate <- factor(Neighborhood$rate, levels = c('low', 'mid', 'high'))
str(Neighborhood$rate)
## Factor w/ 3 levels "low","mid","high": 3 3 3 3 3 2 3 3 3 3 ...
library(plyr)
table(Neighborhood$neigh)
##
## Downtown The Strip
## 4332 27068
names(Neighborhood)[4] <- "neigh"
p <- ggplot(Neighborhood, aes(x = neigh, y = sent, fill = neigh)) + geom_boxplot()
p <- p + scale_x_discrete(name = "") +
scale_y_continuous(name = "Sentiment Score") +
ggtitle("Sentiment Score of Reviews in Each Neighbhorhood") +
labs(fill = "Neighbhoorhood") + coord_flip() +
theme_tufte()
#theme(axis.text.y=element_blank())
ggplotly(p)
## Warning: We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## Warning: Removed 418 rows containing non-finite values (stat_boxplot).
labels <- c(high = "High [4, 5]", mid = "Mid [2.5, 3.5]", low = "Low [1, 2]")
p2 <- ggplot(Neighborhood, aes(x = neigh, y = sent, fill = neigh)) + geom_boxplot() +
theme_tufte() +
#theme(axis.text.x=element_blank()) +
labs(fill = "Neighbhoorhood") +
scale_x_discrete(name = "") +
scale_y_continuous(name = "Sentiment Score") +
ggtitle("Sentiment Score of Reviews of Each Neighbhorhood by Rating")
p2 <- p2 + facet_wrap(~rate, labeller=labeller(rate = labels))
ggplotly(p2)
## Warning: We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## Warning: Removed 418 rows containing non-finite values (stat_boxplot).